/***
This do-file creates the CPS national series by wage quartile used in our 
analysis. We process the CPS data in the most analogous way possible to our 
processing in the employment pipeline, so we apply the same methodology to 
define the wage quartiles.
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root; when r(buildrunning) is 0 (no project build in progress), load
* the interactive configuration instead
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Register the shared globals file as a dependency, then run it
project, uses("${root}/code/set_globals.do")
include "${root}/code/set_globals.do"
local category "Employment"

* Make sure every output folder exists. Parents are listed before their
* children, and -capture- swallows the error raised when a folder is already there.
foreach folder in "data/derived/CPS" "results" "results/Employment" ///
	"results/paper numbers" "results/paper numbers/`category'" {
	cap mkdir "${root}/`folder'"
}

*-------------------------------------------------------------------------------
* 1 - Get multipliers
*-------------------------------------------------------------------------------

* Load Thresholds
project, uses("${root}/data/dvc/Employment/poverty_thresholds.dta")
use "${root}/data/dvc/Employment/poverty_thresholds.dta", clear 

* Get multipliers
* For a threshold within 50 cents of a round-dollar wage level, multiplier_X is:
*   threshold - (level - 0.5)       when the threshold is below the level  -> in [0, 0.5)
*   threshold - (level - 0.5) - 1   when the threshold is at/above the level -> in [-0.5, 0]
* It stays missing when the threshold is not within 50 cents of any listed level.
cap drop multiplier 
foreach poverty in 100 150 250 {

	* Create variable 
	gen multiplier_`poverty' = .

	* Round-dollar levels examined for this threshold. The list is reset on
	* every pass: previously the 250 pass silently inherited the 150 list
	* (19-23) and filled multiplier_250 with values from the wrong levels.
	* No levels are defined for 250, so multiplier_250 is kept (all missing)
	* only so the saved file still carries the variable; the smoothing step
	* later in this file reads multiplier_100 and multiplier_150 only.
	local levels ""
	if `poverty' == 100 local levels "13 14 15 16"
	else if `poverty' == 150 local levels "19 20 21 22 23"

	foreach level of local levels {
		replace multiplier_`poverty' = poverty_`poverty' - `=`level'-0.5' if inrange(poverty_`poverty', `=`level'-0.5', `=`level'+0.5')
		replace multiplier_`poverty' = multiplier_`poverty' - 1 if inrange(poverty_`poverty', `=`level'', `=`level'+0.5')
	}
}

* Keep the thresholds plus multipliers for the merge after the collapse
tempfile multiplier
save `multiplier'

*-------------------------------------------------------------------------------
* 2 - Import CPS and apply sample restrictions
*-------------------------------------------------------------------------------

* Load the CPS extract and register it as a build dependency
local cps_extract "${root}/data/dvc/CPS/cps_00037.dta"
project, uses("`cps_extract'")
use "`cps_extract'", clear

* Sample restrictions: ages 16 and up, survey years after 2019.
* Both variables must be fully populated, otherwise a missing value (which
* Stata treats as larger than any number) would pass the filters below.
assert !mi(age) & !mi(year)
keep if age >= 16 & year > 2019

* Create NAICS
* Crosswalk from the CPS industry code (ind) to a 2-digit NAICS sector.
* Each rule reads "<sector> <first ind code> <last ind code>", both ends
* inclusive. The ind ranges do not overlap, so the order of the rules does
* not matter; ind values outside every range leave naics missing.
gen naics = .
foreach rule in ///
	"11 170 290"   "21 370 490"   "22 570 690"   "23 770 770"   ///
	"31 1070 1790" "32 1870 2590" "33 2670 3990" "42 4070 4590" ///
	"44 4670 5190" "45 5275 5790" "48 6070 6290" "49 6370 6390" ///
	"51 6470 6780" "52 6870 6992" "53 7070 7190" "54 7270 7490" ///
	"55 7570 7570" "56 7580 7790" "61 7860 7890" "62 7970 8470" ///
	"71 8560 8590" "72 8660 8690" "81 8770 9290" "92 9370 9890" {
	* `1' = sector, `2' = lower ind bound, `3' = upper ind bound
	tokenize `rule'
	replace naics = `1' if inrange(ind, `2', `3')
}
						
* Be consistent with PIE and CES series
* naics_code is the string sector label: the 2-digit sector itself, except
* that manufacturing (31-33), retail (44-45) and transportation/warehousing
* (48-49) are pooled into "3133", "4445" and "4849". Sector 92 and unmapped
* industries keep an empty code.
gen naics_code = ""
replace naics_code = string(naics) if !mi(naics) & naics != 92
replace naics_code = "3133" if inlist(naics, 31, 32, 33)
replace naics_code = "4445" if inlist(naics, 44, 45)
replace naics_code = "4849" if inlist(naics, 48, 49)
				
* Drop some sectors according to BLS adjustment 
drop if naics == 92 		// drop those working in public sector to match CES (Total Private Employment)
drop if naics == 11 		// drop those working in agriculture, forestry, fishing, and hunting according to BLS adjustment of CPS to CES
* Private households are identified by the industry code, not by the 2-digit
* sector: naics only ever holds values 11-92 (ind 9290 is folded into 81), so
* the previous test `naics == 9290' could never be true and dropped nobody.
drop if ind == 9290 		// drop workers in private households such as nannies, housekeepers, etc.
			
* Drop some classes of workers according to BLS adjustment
* 0 = missing, 13 = self-employed, unincorporated, 25-28 = public sector.
* NOTE(review): in the IPUMS-CPS CLASSWKR codebook 29 appears to be "unpaid
* family worker" rather than a public-sector class -- confirm against the codebook.
drop if inlist(classwkr, 0, 13, 25, 26, 27, 28, 29)
			
* Keep those with jobs 
* NOTE(review): only empstat == 10 is retained; presumably this is "at work",
* which excludes people who have a job but were absent -- confirm this is intended.
keep if empstat == 10
			
* Convert to super sector
* The mapping is keyed on the numeric 2-digit sector (naics) rather than on
* naics_code. naics_code stores the pooled sectors as "3133", "4445" and
* "4849" (no hyphen), so the previous string match on "31-33", "44-45" and
* "48-49" never fired and left manufacturing, retail and transportation
* without a super sector.
gen naics_ss = ""
replace naics_ss = "10" if inlist(naics, 11, 21)
replace naics_ss = "20" if naics == 23
replace naics_ss = "30" if inlist(naics, 31, 32, 33)
replace naics_ss = "40" if inlist(naics, 42, 44, 45, 48, 49, 22)
replace naics_ss = "50" if naics == 51
replace naics_ss = "55" if inlist(naics, 52, 53)
replace naics_ss = "60" if inlist(naics, 54, 55, 56)
replace naics_ss = "65" if inlist(naics, 61, 62)
replace naics_ss = "70" if inlist(naics, 71, 72)
replace naics_ss = "80" if naics == 81
			
* Reformat NAICS codes  
* Turns any hyphen in naics_code into an underscore; with the hyphen-free
* codes assigned above this currently changes nothing, but it is kept so the
* output stays valid if hyphenated codes are ever reintroduced.
replace naics_code = subinstr(naics_code, "-", "_", .) 
			
* Define hourly wages
cap drop wage

* Blank out values above each variable's ceiling
* (presumably the extract's not-in-universe / special codes -- confirm against the codebook)
local ceiling_earnweek  9999
local ceiling_uhrswork1 996
local ceiling_hourwage  999
foreach var in earnweek uhrswork1 hourwage {
	replace `var' = . if `var' > `ceiling_`var''
}

* paidhour == 2 with no usable hourly wage: fall back on weekly earnings
* divided by usual weekly hours
replace hourwage = earnweek / uhrswork1 if paidhour == 2 & mi(hourwage)

* paidhour == 2 takes the hourly wage; paidhour == 1 takes weekly earnings
* over usual hours; any other paidhour value leaves wage missing
gen wage = cond(paidhour == 2, hourwage, earnweek / uhrswork1) if inlist(paidhour, 1, 2)

* Clamp non-missing wages to the interval [5, 100]
replace wage = min(max(wage, 5), 100) if !mi(wage)

* Monthly date, pinned to the 15th of the survey month
gen date = mdy(month, 15, year)
format date %td

*-------------------------------------------------------------------------------
* 3 - Merge poverty thresholds 
*-------------------------------------------------------------------------------

* Attach the monthly thresholds. Every CPS row must find its month in the
* threshold file (assert), and threshold months with no CPS rows are discarded.
local threshold_file "${root}/data/dvc/Employment/poverty_thresholds.dta"
project, uses("`threshold_file'")
merge m:1 date using "`threshold_file'", keep(3) assert(2 3) nogen

*-------------------------------------------------------------------------------
* 4 - Gen quartiles
*-------------------------------------------------------------------------------

* Bin each worker by where the wage falls relative to the three thresholds;
* upper bounds are inclusive and workers without a wage get no quartile
local has_wage "!mi(wage)"
gen quartile = .
replace quartile = 1 if `has_wage' & wage <= poverty_100
replace quartile = 2 if `has_wage' & wage > poverty_100 & wage <= poverty_150
replace quartile = 3 if `has_wage' & wage > poverty_150 & wage <= poverty_250
replace quartile = 4 if `has_wage' & wage > poverty_250

*-------------------------------------------------------------------------------
* 5 - Smooth round numbers
*-------------------------------------------------------------------------------

* Get mass at integer wages
gen count = 1

* Weighted number of workers paid exactly each round-dollar wage, by month.
* All levels are built in a single pass using 0/1 indicators, instead of one
* preserve/collapse/restore cycle per level. A month in which nobody sits at
* a given level now gets a mass of 0; before, that month was absent from the
* level's file and the assert(3) merge below aborted (and -collapse- itself
* failed if no observation at all matched the level).
local round_levels 13 14 15 16 19 20 21 22 23

preserve
local mass_vars
foreach level of local round_levels {
	gen byte mass_`level' = (wage == `level')
	local mass_vars `mass_vars' mass_`level'
}
collapse (sum) `mass_vars' [pw=earnwt], by(date)
tempfile mass
save `mass'
restore

* Collapse 
* Weighted employment by month and quartile (rows with a missing quartile
* form their own group here)
collapse (sum) employment_cps = count  [pw=earnwt], by(date quartile)  

* Get poverty thresholds again 
project, uses("${root}/data/dvc/Employment/poverty_thresholds.dta")
merge m:1 date using "${root}/data/dvc/Employment/poverty_thresholds.dta", keep(3) nogen 

* Get multipliers  
merge m:1 date using `multiplier', keep(3) nogen 

* Merge in Mass
* Brings in mass_13 ... mass_23; every month must be present on both sides
merge m:1 date using `mass', assert(3) nogen 

* Add in multiplier * mass 
* adjustment_1 belongs to the poverty_100 cutoff (levels 13-16) and
* adjustment_2 to the poverty_150 cutoff (levels 19-23). Each stays 0 unless
* the cutoff lies within roughly 50 cents of one of its levels, in which case
* it is that level's mass times the cutoff's multiplier.
local cutoff_1 100
local cutoff_2 150
local levels_1 13 14 15 16
local levels_2 19 20 21 22 23

forvalues q = 1/2 {
	local cutoff `cutoff_`q''
	gen adjustment_`q' = 0
	foreach level of local levels_`q' {
		replace adjustment_`q' = multiplier_`cutoff'*mass_`level' if inrange(poverty_`cutoff', `level'-0.4999999999, `level'+0.5)
	}
}

* One id per quartile x month cell
gegen cell = group(quartile date)

* Fix 
* Replace each adjustment with its within-cell average
forvalues q = 1/2 {
	bys cell: gegen temp_`q' = mean(adjustment_`q')
	replace adjustment_`q' = temp_`q'
}

* Make smoothed variable
* Quartile 1 gains adjustment_1; quartile 2 gives up adjustment_1 and gains
* adjustment_2; quartile 3 gives up adjustment_2. Everything else (quartile 4,
* missing quartile, or a missing result) falls back to the raw count.
gen emp_cps_smooth = cond(quartile == 1, employment_cps + adjustment_1, ///
	cond(quartile == 2, employment_cps + adjustment_2 - adjustment_1, ///
	cond(quartile == 3, employment_cps - adjustment_2, .)))
replace emp_cps_smooth = employment_cps if mi(emp_cps_smooth)

*-------------------------------------------------------------------------------
* 6 - Norm
*-------------------------------------------------------------------------------

* Rows whose quartile is missing are not part of the output
drop if mi(quartile)

* Baseline: each quartile's smoothed employment in January 2020
gen jan = emp_cps_smooth if year(date) == 2020 & month(date) == 1
bys quartile: gegen base = mean(jan)

* Percent change in smoothed employment relative to that baseline
gen norm_emp_cps = 100 * (emp_cps_smooth / base - 1)

* Keep the output columns only, ordered by quartile then month
keep date quartile norm_emp_cps employment_cps
sort quartile date

local cps_series "${root}/data/derived/CPS/CPS by wage quartile.dta"
save "`cps_series'", replace
project, creates("`cps_series'")

*-------------------------------------------------------------------------------
* Scalars for paper: wage thresholds and % employed in each wage bin in Jan 2020
*-------------------------------------------------------------------------------

* Start from a clean YAML file (no error if it does not exist yet)
local pct_yaml "${root}/results/paper numbers/`category'/CPS Employment per Wage Quartile.yaml"
cap erase "`pct_yaml'"

* % employed in each wage bin in Jan 2020
* Shares are computed from employment_cps, the unsmoothed weighted count
keep if date == mdy(1,15,2020)
egen total_employment_cps = total(employment_cps)
gen pct_employment_cps = employment_cps / total_employment_cps * 100

* Export scalars 
forval q = 1/4 {

	* Exactly one January 2020 row is expected for each quartile
	count if quartile == `q'
	assert r(N) == 1

	* Format the share to one decimal and write it under its own key
	sum pct_employment_cps if quartile == `q'
	local pct_emp_cps_q`q': di %3.1f r(mean)

	yamlout using "`pct_yaml'", ///
		key("pct_emp_cps_q`q'") ///
		comment("Percent employed in each wage quartile in CPS Jan 2020") ///
		value(`pct_emp_cps_q`q'') fmt(%3.1f)
}

* Wage thresholds in Jan 2020
project, uses("${root}/data/dvc/Employment/poverty_thresholds.dta")
use "${root}/data/dvc/Employment/poverty_thresholds.dta", clear
keep if date == mdy(1,15,2020)

* The threshold file must hold exactly one row for January 2020
assert _N == 1

local threshold_yaml "${root}/results/paper numbers/`category'/CPS Employment per Wage Quartile.yaml"

* Each spec reads "<key suffix> <threshold variable suffix> <label>".
* The cutoffs are written in order: Q1-Q2 (poverty_100), Q2-Q3 (poverty_150),
* Q3-Q4 (poverty_250), each formatted to two decimals.
foreach spec in "q1q2 100 Q1-Q2" "q2q3 150 Q2-Q3" "q3q4 250 Q3-Q4" {
	* `1' = key suffix, `2' = threshold variable suffix, `3' = label
	tokenize `spec'

	sum poverty_`2'
	local wage_threshold_`1': di %4.2f r(mean)

	yamlout using "`threshold_yaml'", ///
		key("wage_threshold_`1'") ///
		comment("Jan 2020 `3' Wage Threshold") ///
		value(`wage_threshold_`1'') fmt(%4.2f)
}

* Register the finished YAML file as a build output
project, creates("`threshold_yaml'")
